Attempt to run a simple linear regression on a population of bonds downloaded from Capital IQ to detect outliers and screen for relative value opportunities, identifying where to focus deeper-dive work.
The Excel files contain ~70 columns, containing bond-specific descriptive fields, such as coupon, maturity, covenants, etc., as well as company fundamental metrics, such as Net Debt/EBITDA, EV/EBITDA, EBITDA margin, Debt/Capital ratio, etc. The files need to be combined in a Pandas dataframe, which will be cleaned up and used to run a multiple linear regression on the data, targeting YTW as the dependent variable. This is just a proof of concept - eventually the dependent variable should be either spread-to-worst or option-adjusted-spread.
# Core stdlib + scientific stack imports and notebook display setup.
import re, glob, os, time, datetime
from dateutil.parser import parse
import pandas as pd
pd.options.display.max_columns = None
from xlsxwriter.utility import xl_rowcol_to_cell, xl_cell_to_rowcol
import numpy as np
import scipy
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import blackscholes
%matplotlib inline
# NOTE(review): `tree` and `preprocessing` are each imported twice below — harmless but redundant.
from sklearn import ensemble, neural_network, tree, linear_model, kernel_ridge, neighbors, svm, tree, naive_bayes
from sklearn import preprocessing, metrics, decomposition, preprocessing, model_selection, feature_selection
# NOTE(review): sklearn.externals.joblib is deprecated/removed in newer scikit-learn; `import joblib` is the modern form.
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from IPython.display import display, HTML, clear_output
# Widen the notebook layout and shrink the fonts for dense tables.
display(HTML(data="""
<style>
div#notebook-container { width: 95%; font-family: Monaco; font-size: 10pt;}
div#menubar-container { width: 95%; font-family: Monaco; font-size: 10pt;}
div#maintoolbar-container { width: 99%; font-family: Monaco; font-size: 10pt;}
.CodeMirror{font-size: 8pt;}
</style>
"""))
# Project-local module mapping raw CapIQ column names to short names and groupings.
import field_constants
matplotlib.rcParams['figure.figsize'] = (16.0, 8.0)
Run once to build the pickle from Excel files downloaded from a broad Capital IQ screen.
# dataset_path = os.path.join('datasets', 'us_europe_ds', '20180430')
# files = [f for f in os.listdir(os.path.join(dataset_path, 'excel'))]
# df = pd.read_excel(os.path.join(dataset_path, 'excel', files[0]), skiprows=7)
# for file in files[1:]:
# df = df.append(pd.read_excel(os.path.join(dataset_path, 'excel', file), skiprows=7))
# df.reset_index(inplace=True, drop=True)
# df.to_pickle(os.path.join(dataset_path, 'dataset.pkl'))
Load dataset from a pickled CapIQ screen
dataset_path = os.path.join('datasets', 'us_europe_ds', '20180430')
df = pd.read_pickle(os.path.join(dataset_path, 'dataset.pkl'))
# Map CapitalIQ field names to short, presentable names that can be more easily managed
# field_constants.get_fields returns the column-name mapping plus helper lists
# (numeric columns, bond identifier columns, sorted ratings, output groupings).
numeric_fields, short_column_names, column_mapping,\
bond_id_columns,ratings_sorted, numeric_fields_mapped,\
group_categories, sorted_results_mapping, sorted_results_columns = field_constants.get_fields(df)
df.head(3)
The above dataset is clearly messy. The functions below help with cleaning up the dataset and mapping the column names to something more readable.
def engineer_features(df, drop_perpetual_maturities=True, ytw_cutoff=100, maturity_cutoff=150, excl_no_data_columns=None,
                      fill_mean_columns=None, winsorize_columns=(None, None), country_list=None, fillna=0, show_exclusion_metrics=False):
    '''Clean up dataset: exclude/include bonds with no leverage metrics,
    drop perpetual maturities, screen out by specified yield and maturity cutoffs,
    force convert numeric columns, clean up dates, combine local/foreign ratings
    and apply specific fixes.

    Parameters
    ----------
    df : Pandas DataFrame
        Dataframe to apply transformations to.
    excl_no_data_columns : list, default: None
        Choose which securities to exclude if there is no data.
    drop_perpetual_maturities : bool, default: True
        Choose whether to drop securities with perpetual maturities.
    ytw_cutoff : int, default: 100
        Filter out securities with yields higher than the specified threshold.
    maturity_cutoff : int, default: 150
        Filter out securities with maturities (in years) higher than the specified threshold.
    show_exclusion_metrics : bool, default: False
        Print the exclusion metrics.
    fill_mean_columns : list, default: None
        Fill sector/country averages for NaN values in selected columns.
    winsorize_columns : tuple(list, float), default: (None, None)
        A tuple of (list, float), specifying a list of columns to winsorize extreme
        values on and at which percentile.
    country_list : list, default: None
        List of countries to include by 'Country of Incorporation'.
    fillna : str or int, default: 0
        Fill all NaN datapoints in the dataframe with the specified value.

    Returns
    ----------
    Clean pandas dataframe with selected transformations applied.
    '''
    exclusion_metrics = {}
    df = df.copy()
    # Tally how many rows each filter would remove, before any filtering happens.
    if show_exclusion_metrics:
        if country_list is not None:
            exclusion_metrics['Country Exclusions'] = (~df['Country of Incorporation'].isin(country_list)).sum()
        exclusion_metrics['YTW Cutoff'] = (df['YTW'] >= ytw_cutoff).sum()
        exclusion_metrics['Perpetuals'] = (df['Maturity Date'] == 'Perpetual').sum()
        if excl_no_data_columns is not None:
            for col in excl_no_data_columns:
                exclusion_metrics['No data: {}'.format(col)] = df[col].isnull().sum()
    df = df.loc[df['YTW'] < ytw_cutoff, :]
    if country_list is not None:
        df = df[df['Country of Incorporation'].isin(country_list)]
    if excl_no_data_columns is not None:
        # FIX: was `for col in col in excl_no_data_columns:` (chained-comparison typo).
        for col in excl_no_data_columns:
            df = df[df[col].notna()]
    df = df.pipe(transform_perpetuals, drop=drop_perpetual_maturities)
    df = df.pipe(date_cleanup)
    exclusion_metrics['Maturity Cutoff'] = (df['Years to Maturity'] >= maturity_cutoff).sum()
    df = df.loc[df['Years to Maturity'] < maturity_cutoff, :]
    df = df.pipe(ratings_cleanup)
    df = df.pipe(combine_ratings)
    df = df.pipe(specific_fixes)
    # Add calculated features, such as EV/EBITDA, winsorize extreme values,
    # and calculate callable bond delta/premium.
    # (Was a stray string statement, now a proper comment.)
    df = df.pipe(add_features)
    if winsorize_columns[0] is not None:
        df = df.pipe(winsorize, winsorize_columns[0], winsorize_columns[1])
    df['Call Delta'] = df[['Price', 'Next Call Price', 'Years to Next Call']].apply(get_bond_delta, axis=1)
    # Annualized premium/discount to the next call price (call horizon floored at 1y).
    df['Call Premium'] = ((df['Next Call Price'] / df['Price'])**(1/df['Years to Next Call'].clip(lower=1)) - 1) * 100
    # FIX: was `df_clean['Sector'] == '-'`, silently reading a module-level global
    # instead of the frame being transformed.
    mask = df['Sector'] == '-'
    df.loc[mask, 'Sector'] = df.loc[mask, 'Sector (Parent)']
    # df = df.pipe(add_zscore)
    if fill_mean_columns is not None:
        df = df.pipe(fillna_mean, columns=fill_mean_columns, group=['Sector', 'Country of Incorporation'])
    df = df.fillna(fillna)
    df = df.sort_values('Issuer')
    if show_exclusion_metrics:
        for k, v in exclusion_metrics.items():
            print('{}: {}'.format(k, v))
    return df
def clean_df(df):
    '''Rename raw CapIQ columns to their mapped short names and coerce the
    numeric fields, turning unparseable entries into NaN.'''
    renamed = df.copy().rename(column_mapping, axis=1)
    renamed.loc[:, numeric_fields_mapped] = renamed.loc[:, numeric_fields_mapped].apply(pd.to_numeric, errors='coerce')
    return renamed
def get_bond_delta(row):
    """Get the 'delta' of a callable bond using a simple Black model.

    Intended for ``df.apply(..., axis=1)`` over rows of
    (Price, Next Call Price, Years to Next Call).

    Returns NaN when there is no call price, the call date has passed,
    or the pricing model fails on degenerate inputs.
    """
    s, k, t = row.values
    if np.isnan(k) or t <= 0:
        return np.nan
    try:
        # Placeholder model inputs: 2% rate, no dividends, 20% vol, t in days.
        delta = blackscholes.BSMerton([1, s, k, .02, .00, t*365, .2]).delta()
    except Exception:
        # FIX: was a bare `except:` which also swallowed KeyboardInterrupt/SystemExit.
        return np.nan
    return delta[0]
def add_features(df):
    'Derive valuation ratios and current yield from the raw fundamental fields.'
    derived = {
        'EV/EBITDA Fwd': df['EV'] / df['NTM EBITDA'],
        'EV/EBITDA Fwd (Parent)': df['EV (Parent)'] / df['NTM EBITDA (Parent)'],
        'Net Debt/EBITDA': df['Net Debt (Parent)'] / df['EBITDA (Parent)'],
        'Net Debt/EBITDA Fwd (Parent)': df['Net Debt (Parent)'] / df['NTM EBITDA (Parent)'],
        'Current Yield': (df['Coupon'] / df['Price']) * 100,
    }
    for name, values in derived.items():
        df.loc[:, name] = values
    # df.loc[:, 'Net Debt/EBITDA Fwd (Parent)'] = df.loc[:, 'Net Debt/EBITDA Fwd (Parent)'].fillna(df['Net Debt/EBITDA'])
    return df
def transform_perpetuals(df, drop=True, set_maturity='2118-12-31'):
    '''Drop perpetual bonds or assign them a far-dated maturity.

    Parameters
    ----------
    df : Pandas DataFrame
        Must contain a 'Maturity Date' column that may hold the string 'Perpetual'.
    drop : bool, default: True
        Drop rows whose maturity is 'Perpetual'.
    set_maturity : str or None, default: '2118-12-31'
        Date assigned to any remaining perpetuals so date math works downstream.
    '''
    df = df.copy()
    if drop:
        df = df.loc[df['Maturity Date'] != 'Perpetual', :]
    if set_maturity is not None:
        # FIX: honor the `set_maturity` argument — previously '2118-12-31' was hard-coded.
        df.loc[df['Maturity Date'] == 'Perpetual', 'Maturity Date'] = parse(set_maturity)
    return df
def date_cleanup(df):
    '''Add 'Years to Maturity' and 'Years to Next Call' columns (fractional years).

    'Years to Next Call' is rounded to 2 decimals and NaN when there is no
    parseable call date. Both columns are measured against a single "now"
    timestamp (previously `today()` was called separately per column, so the
    two measures could disagree by a few microseconds).
    '''
    now = datetime.datetime.today()
    one_year = datetime.timedelta(days=365)
    df['Years to Maturity'] = (pd.to_datetime(df['Maturity Date']) - now) / one_year
    df['Years to Next Call'] = pd.to_datetime(df['Next Call Date'], errors='coerce').apply(
        lambda x: np.round((x - now) / one_year, decimals=2) if pd.notna(x) else np.nan)
    return df
def ratings_cleanup(df):
    "Strip the S&P structured-finance ' (sf)' suffix from both rating columns."
    for col in ('Rating (Local Currency)', 'Rating (Foreign Currency)'):
        df.loc[:, col] = df.loc[:, col].apply(lambda rating: rating.replace(' (sf)', ''))
    return df
def combine_ratings(df):
    '''Combine S&P local- and foreign-currency ratings into a single 'Rating'
    column: prefer local currency, falling back to foreign where local is '-'.'''
    local = df['Rating (Local Currency)'].replace('-', np.nan)
    combined = local.fillna(df['Rating (Foreign Currency)'])
    # the short/long-term composite rating is not usable here — blank it out
    df['Rating'] = combined.replace('AA-/A-1+', '')
    return df
def specific_fixes(df):
    'Apply one-off data corrections: call prices quoted as 1000 instead of 100.'
    # Roche (IQT315939319) and Intel (IQT338089502) next-call prices
    mispriced_calls = ['IQT315939319', 'IQT338089502']
    df.loc[df['Security ID'].isin(mispriced_calls), 'Next Call Price'] = 100
    return df
def encode_ratings(df):
    '''Ordinal-encode ratings into a single numeric field (1 = first entry of
    the sorted ratings list) instead of one-hot encoding. Raises KeyError for
    ratings not present in `ratings_sorted`.'''
    encoded = {rating: rank for rank, rating in enumerate(ratings_sorted, start=1)}
    df['Rating'] = df['Rating'].apply(encoded.__getitem__)
    return df
def winsorize(df, columns, percentile=0.005):
    '''Winsorize the upper tail of the selected columns in place.

    Parameters
    ----------
    columns : list
        List of columns to apply winsorization to. NaN entries are left untouched.
    percentile : float, default: 0.005
        Fraction of the upper tail to clip (limits=(0, percentile): lower
        tail is never clipped).

    Returns
    ----------
    Pandas dataframe with winsorization applied to selected columns.
    '''
    # FIX: removed an unused local (`numeric_fields_mapped`) that silently
    # depended on module globals and was never read.
    for field in columns:
        notna = df[field].notna()
        df.loc[notna, field] = scipy.stats.mstats.winsorize(df.loc[notna, field], limits=(0, percentile))
    return df
def fillna_mean(df, columns, group):
    '''Fill NaN with means from a group.

    Parameters
    ----------
    columns : list
        Columns whose NaN values are replaced with their group's mean.
    group : str or list
        Column name(s) to group by.

    Returns
    ----------
    Pandas dataframe with group means applied to selected columns.
    '''
    def _fill_with_group_mean(series):
        return series.fillna(series.mean())

    df.loc[:, columns] = df.groupby(group)[columns].transform(_fill_with_group_mean)
    return df
def add_zscore(df):
    '''Append a '<field>_zscore' column for every mapped numeric field,
    using the population standard deviation (ddof=0).'''
    cols = [column_mapping[field] for field in numeric_fields]
    numeric = df.loc[:, cols]
    zscores = numeric.apply(lambda series: (series - series.mean()) / series.std(ddof=0))
    zscores.columns = ['{}_zscore'.format(col) for col in cols]
    return df.join(zscores)
def missing_values_table(df):
    '''Summarize missing data per column: absolute count and % of rows,
    restricted to columns with at least one NaN, sorted descending by %.'''
    counts = df.isnull().sum()
    percents = 100 * df.isnull().sum() / len(df)
    table = pd.concat([counts, percents], axis=1)
    table = table.rename(columns={0: 'Missing Values', 1: '% of Total Values'})
    table = table[table['% of Total Values'] != 0]
    table = table.sort_values('% of Total Values', ascending=False).round(1)
    print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
           "There are " + str(table.shape[0]) +
           " columns that have missing values.")
    return table
# Columns whose NaN values get filled with sector/country group means.
fill_mean_columns = ['Net Debt/EBITDA', 'Net Debt/EBITDA Fwd (Parent)', 'EV/EBITDA Fwd (Parent)', '2y FWD EBITDA Growth (Parent)']
country_list = ['France', 'Italy', 'United States', 'Switzerland', 'United Kingdom', 'Sweden', 'Finland', 'Germany', 'Spain', 'Belgium', 'Canada', 'Australia']
df_clean = clean_df(df)
# First pass: no winsorization, yields capped at 25%, maturities at 100 years.
df_analysis = engineer_features(df_clean, excl_no_data_columns=None, drop_perpetual_maturities=True, ytw_cutoff=25,
                                maturity_cutoff=100, fill_mean_columns=fill_mean_columns, country_list=country_list,
                                fillna=0, show_exclusion_metrics=True)
# Sanity-check scatter: observed YTW vs. years to maturity.
fig, ax = plt.subplots(1, figsize=(15,8))
ax.scatter(df_analysis['Years to Maturity'],
           df_analysis['YTW']);
ax.set_xlabel('Years to Maturity')
ax.set_ylabel('Yield to Worst');
print('Number of bonds selected/total: {}/{} (excl: {})'.format(df_analysis.shape[0], df.shape[0], df.shape[0]-df_analysis.shape[0]))
# Candidate model features (bond descriptors + issuer fundamentals);
# commented-out entries were tried and excluded.
features = [
    'Seniority',
    'Coupon',
    'Coupon Type',
    'Amount Outstanding',
    'Rating',
    'Duration',
    'Currency',
    'Sector',
    'Primary Industry',
    'Country of Incorporation',
    'Years to Maturity',
    'Security Type',
    'Next Call Price',
    # 'Call Premium',
    'Call Delta',
    'Conversion Premium',
    'Net Debt/EBITDA',
    'Net Debt/EBITDA Fwd (Parent)',
    'EV/EBITDA Fwd',
    'EV/EBITDA Fwd (Parent)',
    'LTM EBITDA Growth (Parent)',
    '2y FWD EBITDA Growth (Parent)',
    '2y Fwd Revenue Growth (Parent)',
    'EBITDA Margin (Parent)',
    'Total Debt/Capital',
    'Total Debt/Capital (Parent)',
    # 'Price Change (3m %)',
    # 'Price',
    # 'Current Yield',
    # 'Years to Next Call',
    # 'No of Analysts (Parent)',
    # 'Price Change (3m %)',
]
# Encoder instances kept for reference; pd.get_dummies is what is actually used below.
le = preprocessing.LabelEncoder()
lb = preprocessing.LabelBinarizer()
mlb = preprocessing.MultiLabelBinarizer()
The callable bond delta calculation takes ~5 seconds - see if this can be optimized.
# Second pass: same as above, but also winsorizes extreme Net Debt/EBITDA values.
fill_mean_columns = ['Net Debt/EBITDA', 'Net Debt/EBITDA Fwd (Parent)', 'EV/EBITDA Fwd (Parent)', '2y FWD EBITDA Growth (Parent)']
country_list = ['France', 'Italy', 'United States', 'Switzerland', 'United Kingdom', 'Sweden', 'Finland', 'Germany', 'Spain', 'Belgium', 'Canada', 'Australia']
df_clean = clean_df(df)
df_analysis = engineer_features(df_clean, excl_no_data_columns=None, drop_perpetual_maturities=True, ytw_cutoff=25,
                                maturity_cutoff=100, fill_mean_columns=fill_mean_columns, winsorize_columns=(['Net Debt/EBITDA'], 0.002),
                                country_list=country_list, fillna=0, show_exclusion_metrics=True)
df_analysis.head(3)
Parsing the list of covenants takes the most time (~10 sec).
# Build the model matrix: one-hot encode categoricals and covenant lists.
train = df_analysis[features].copy()
# apply_zscore = ['Amount Outstanding', 'Net Debt/EBITDA']
# train[apply_zscore] = train.groupby(['Country', 'Sector']).transform(lambda x: (x - x.mean())/x.std())[apply_zscore]
# get one-hot encodings for all categoricals
train = pd.get_dummies(train)
train = train.drop('Rating_-', axis=1, errors='ignore')
# get one-hot encodings for all covenants
# Covenants come as '; '-separated strings; frozensets allow set union/membership below.
# NOTE(review): assigning a one-column DataFrame (.to_frame) into a column relies on
# pandas coercing it back to a Series — confirm on the pandas version in use.
train['BP'] = df_analysis['Bondholder Protective'].apply(lambda x: x.split('; ')).apply(frozenset).to_frame(name='BP')
train['IR'] = df_analysis['Issuer Restrictive'].apply(lambda x: x.split('; ')).apply(frozenset).to_frame(name='IR')
# One indicator column per covenant observed anywhere in the population.
for bp in frozenset.union(*train.BP):
    train[bp] = train.apply(lambda _: int(bp in _.BP), axis=1)
train = train.drop(['-', 'BP'], axis=1)
for ir in frozenset.union(*train.IR):
    train[ir] = train.apply(lambda _: int(ir in _.IR), axis=1)
train = train.drop(['-', 'IR'], axis=1)
feature_names = train.columns
df_train_copy = train.copy()
# preprocess the dataset (standard scaler, etc)
pipeline = Pipeline([
    ('scale', preprocessing.StandardScaler()),
    # ('pca', decomposition.PCA()),
])
df_train_copy.head()
# NOTE(review): in a notebook __name__ is always '__main__', so this guard is
# effectively a no-op; the later cells rely on df_feature_importances existing.
if __name__ == '__main__':
    # Feature importances
    clf_rf = ensemble.RandomForestRegressor(n_estimators=100, min_samples_leaf=5, max_depth=30, oob_score=True, n_jobs=-1)
    clf_rf.fit(pipeline.fit_transform(df_train_copy), df_analysis['YTW'])
    df_feature_importances = pd.DataFrame(clf_rf.feature_importances_, index=df_train_copy.columns, columns=['importance']).sort_values('importance', ascending=False)
    top_features = 50
    fig, ax = plt.subplots(1, figsize=(30,8))
    ax.bar(df_feature_importances.head(top_features).index, df_feature_importances.head(top_features).importance, tick_label=df_feature_importances.head(top_features).index)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right');
    ax.set_ylim(0, .4);
    ax.set_ylabel('Feature Importance');
# use feature importance for feature selection
select_top_features = 250
feature_names = df_feature_importances.head(select_top_features).index
df_train = df_train_copy[feature_names].copy()
train = pipeline.fit_transform(df_train)
# Per-sector descriptive statistics (means shown as formatted strings).
df_stats = df_analysis[features + ['Price Change (3m %)']].groupby('Sector').describe()
df_stats.loc[:, (slice(None), ['mean'])].applymap(lambda x: '{:.2f}'.format(x))
# US vs non-US distribution of 3m price change by sector.
y_field = 'Price Change (3m %)'
splot = sns.violinplot(x='Sector', y=y_field, hue='Country', split=True,
                       data=df_analysis.assign(Country = lambda x:df_analysis['Country of Incorporation'].apply(lambda x: 'US' if x == 'United States' else 'Non-US')))
splot.set_ylim(df_analysis[y_field].quantile([.001, .999]).values)
[(x.set_rotation(45), x.set_horizontalalignment('right')) for x in splot.get_xticklabels()]
splot.set_title("3m % Price Change Distribution by Sector", fontsize=15);
matplotlib.rcParams['figure.figsize'] = (16.0, 8.0)
# Data completeness per sector: share of non-'-' values for each column.
df_analysis.replace('-', np.nan).pivot_table(index='Sector', aggfunc='count').div(
    df_analysis.pivot_table(index='Sector', aggfunc='size'), axis=0).applymap(lambda x: '{:.0f}%'.format(x*100)).drop('-')
Adding any combination of sk-learn regressors to run the dataset through. Regressors can be chosen using the runthrough of all linear regressors in scikit-learn at the bottom of the notebook.
# Candidate scikit-learn regressors; commented entries are kept from earlier
# tuning runs. The first positional argument of ElasticNet is alpha.
sk_regressors = {
    'BRidge': linear_model.BayesianRidge(n_iter=100000, tol=1e-3, alpha_1=10**-6, alpha_2=10**-0.25, lambda_1=10**-6, lambda_2=10**-6),
    'ElasticNet': linear_model.ElasticNet(1e-06, l1_ratio=0.1, max_iter=1000000, normalize=True, random_state=1, tol=1e-05, warm_start=False),
    # 'ElasticNet0': linear_model.ElasticNet(alpha=1e-05, copy_X=True, fit_intercept=True, l1_ratio=0.3, max_iter=100000, normalize=False,
    # positive=False, precompute=False, random_state=1, selection='cyclic', tol=1e-05, warm_start=False),
    # 'ElasticNet1': linear_model.ElasticNet(1e-05, l1_ratio=0.1, max_iter=1000000, normalize=True, random_state=1, tol=1e-05, warm_start=False),
    # 'ElasticNet2': linear_model.ElasticNet(1e-06, l1_ratio=1.0, max_iter=1000000, normalize=True, random_state=1, tol=1e-05, warm_start=False),
    # 'ElasticNet3': linear_model.ElasticNet(1e-05, l1_ratio=0.5, max_iter=1000000, normalize=True, random_state=1, tol=1e-05, warm_start=False),
    # 'LassoLarsIC':linear_model.LassoLarsIC(criterion='bic', eps=1e-5),
    # 'MNaiveBayes': naive_bayes.MultinomialNB(),
    # 'NN': neural_network.MLPRegressor(hidden_layer_sizes=(50,50,50), max_`iter=1000, ),
    # 'KRidge': kernel_ridge.KernelRidge(alpha=10**1.5, kernel='polynomial', gamma=None, degree=1, coef0=1, kernel_params=None),
    # 'RF': ensemble.RandomForestRegressor(n_estimators=50, min_samples_leaf=50),
}
Output shows $R^{2}$ scores for each regressor in each of the cross-validated trials
# K-fold cross-validated R^2 for each candidate regressor.
n_splits = 4
df_clf = pd.DataFrame(index=sk_regressors.keys(), columns=['CV' + str(i+1) for i in range(n_splits)])
for clf in sk_regressors:
    df_clf.loc[clf, :] = model_selection.cross_val_score(sk_regressors[clf], train, y=df_analysis['YTW'], cv=model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=1), scoring='r2')
df_clf.loc[:, 'AVG'] = df_clf.mean(axis=1)
df_clf.applymap(lambda x: '{:.1f}%'.format(x*100))
# Refit the chosen model on the full sample and plot predicted vs. observed.
clf = sk_regressors['ElasticNet']
# NOTE(review): `train` was already scaled by the pipeline at the feature-selection
# step above; re-fitting the scaler here looks redundant (near no-op on already
# standardized data) — confirm intent.
train = pipeline.fit_transform(train)
predicted = clf.fit(train, df_analysis['YTW']).predict(train)
fig, ax = plt.subplots(1, figsize=(15,8))
ax.scatter(predicted, df_analysis['YTW'])
ax.text(0.90, 0.95, r"$R^2$ = {:.2f}".format(metrics.r2_score(df_analysis['YTW'], predicted)), transform=ax.transAxes, fontsize=14,
        verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.7))
ax.set_xlabel("Predicted YTW")
ax.set_ylabel("Observed YTW");
Bokeh doesn't deal well with spaces in column names, so create a separate dataframe with sanitized names and apply other presentation-specific formatting.
# Rebuild the engineered frame without mean-/zero-filling (keeps NaN for display),
# then attach model predictions and residuals.
results = engineer_features(df_clean, excl_no_data_columns=None, drop_perpetual_maturities=True, ytw_cutoff=25,
                            maturity_cutoff=100, fill_mean_columns=None, country_list=country_list, fillna=np.nan,
                            show_exclusion_metrics=False)
results['YTW_true'] = results['YTW']
results['YTW_predicted'] = clf.predict(train)
# positive diff = bond yields more than the model predicts (potentially cheap)
results['YTW_diff'] = results['YTW'] - results['YTW_predicted']
results = results.drop_duplicates(subset=bond_id_columns)
# Bucket leverage into labeled ranges for chart grouping.
leverage_groups = [0, 2, 4, 5, 6, 8, 10]
for i in range(len(leverage_groups)-1):
    results.loc[results["Net Debt/EBITDA"].between(leverage_groups[i],leverage_groups[i+1]),"Leverage"] = "{:.0f}-{:.0f}x".format(leverage_groups[i], leverage_groups[i+1])
# Presentation formatting: dates/prices/ratios as strings, '-' for missing.
results['Next Call Date'] = results['Next Call Date'].apply(lambda x: x.strftime('%Y-%m-%d') if isinstance(x, datetime.datetime) else '-')
results['Next Call Price'] = results['Next Call Price'].apply(lambda x: str(x) if pd.notna(x) else '-')
results['Leverage'] = results['Leverage'].fillna('-')
results['Net Debt/EBITDA'] = results['Net Debt/EBITDA'].apply(lambda x: '{:.2f}x'.format(x) if pd.notna(x) else '-')
results['Net Debt/EBITDA Fwd (Parent)'] = results['Net Debt/EBITDA Fwd (Parent)'].apply(lambda x: '{:.2f}x'.format(x) if pd.notna(x) else '-')
results['Total Debt/Capital (Parent)'] = results['Total Debt/Capital (Parent)'].apply(lambda x: '{:.1f}%'.format(x) if pd.notna(x) else '-')
results_columns_sorted_flat = None  # (unused placeholder removed? no — see next line)
sorted_results_columns = [y for x in [sorted_results_mapping[grp] for grp in group_categories] for y in x]
# Bokeh tooltips cannot reference column names with spaces/symbols — sanitize.
results_chart = results.drop("Security ID", axis=1).drop_duplicates()
results_chart.columns = [column.replace(" ", "_") for column in results_chart.columns]
results_chart.columns = [column.replace("/", "_") for column in results_chart.columns]
results_chart.columns = [column.replace("(", "") for column in results_chart.columns]
results_chart.columns = [column.replace(")", "") for column in results_chart.columns]
results_chart.columns = [column.replace("%", "pct") for column in results_chart.columns]
Output a scatterplot of all bond issues with X axis showing 'predicted' YTW and Y axis showing the observed YTW. Size the bubbles according to the Amount Outstanding of the Issue and color each bubble according to the bond's rating.
from bokeh.models import HoverTool
from bokeh.plotting import figure, show, output_file
import holoviews as hv
hv.extension('bokeh')
# Tooltip fields reference the sanitized (underscore) column names of results_chart.
hover = HoverTool(tooltips=[
    ("Issuer", "@Issuer"),
    ("Currency", "@Currency"),
    ("Coupon", "@Coupon"),
    ("Maturity", "@Maturity_Date{%F}"),
    ("Seniority", "@Seniority"),
    ("Rating", "@Rating"),
    ("Amt. Outstanding", "@Amount_Outstanding m"),
    ("Sector", "@{Sector}"),
    ('Country', '@Country_of_Incorporation'),
    ("LTM Leverage", "@Net_Debt_EBITDA"),
    ("NTM Leverage", "@Net_Debt_EBITDA_Fwd_Parent"),
    ("Total Debt/Capital %", "@Total_Debt_Capital_Parent"),
    ("Call Price", "@Next_Call_Price"),
    ("Call Date", "@Next_Call_Date"),
    ("Price", "@Price"),
    ('Current Yield', "@Current_Yield"),
    ('3m % change', '@Price_Change_3m_pct'),
    ("YTW", "@YTW_true"),
    ("YTW est.", "@YTW_predicted"),
    ],
    formatters={
        'Maturity_Date':'datetime',
    },
)
# Dimensions: axes are (predicted, observed) YTW; vdims feed the tooltips.
vdims = ['Issuer', 'Seniority', 'Rating', 'Coupon', 'Sector','Country_of_Incorporation', 'Price', 'Maturity_Date', 'Currency',
         'Net_Debt_EBITDA', 'Net_Debt_EBITDA_Fwd_Parent', 'Total_Debt_Capital_Parent', 'Leverage', 'Amount_Outstanding',
         'Next_Call_Price', 'Next_Call_Date', 'Price_Change_3m_pct', 'Current_Yield', 'YTW_true', 'YTW_predicted']
kdims = ['YTW_predicted', 'YTW_true']
df_plotting = results_chart
# Color each bubble by rating (red-yellow-green), excluding unrated names.
ratings = [rating for rating in ratings_sorted if rating in (set(df_plotting.Rating.unique()) - set(['', '-', 'NR']))]
hv_palette = hv.Palette('RdYlGn', samples=len(ratings), reverse=False)
palette = hv.plotting.util.process_cmap(hv_palette, ncolors=len(ratings), categorical=True)
explicit_mapping = dict([(k, v) for k, v in zip(ratings, palette)])
# Bubble size tracks amount outstanding; one tab per sector plus an all-countries tab.
opts = dict(tools=[hover], toolbar='right', show_grid=True, width=1150, height=800, alpha=0.7,
            size_index='Amount_Outstanding', color_index='Rating', cmap=explicit_mapping, show_legend=False, scaling_factor=0.04, framewise=True)
scatter = hv.Scatter(data=df_plotting, kdims=kdims, vdims=vdims, group='All Countries').options(**opts)
chart_list = [scatter]
for sector in sorted(df_plotting.Sector.unique()):
    scatter_sector = hv.Scatter(data=df_plotting[(df_plotting["Sector"] == sector)],
                                kdims=kdims, vdims=vdims, group=sector).options(**opts)
    chart_list.append(scatter_sector)
hv.Layout(chart_list).options(tabs=True, toolbar='right', shared_axes=False).cols(1)
import shutil
# Jekyll markdown page template that embeds a rendered chart via an iframe.
# {{% raw %}} escapes to a literal {% raw %} after .format() is applied.
CHART_MD_TEMPLATE = '''---
title: {country}
layout: page
chart: output
---
[Back to screen list](../bond_screen.html)
{{% raw %}}
<iframe src="/charts/renders/{chart_name}.html"
style="max-width = 100%; max-height = 100%"
sandbox="allow-same-origin allow-scripts"
width="1200"
height="1000"
scrolling="no"
seamless="seamless"
frameborder="0">
</iframe>
{{% endraw %}}
'''
# Clear out previously rendered charts (best-effort; missing dirs just print).
try:
    shutil.rmtree('charts')
except Exception as e:
    print('Exception', e)
try:
    shutil.rmtree('../github_pages/mkangrga.github.io/mlprojects/charts')
except Exception as e:
    print('Exception', e)
os.makedirs('charts/renders')
# Render the all-countries chart and its markdown wrapper page.
hv.renderer('bokeh').save(hv.Layout(chart_list).options(tabs=True, toolbar='right', shared_axes=False).cols(1),'charts/renders/{}'.format('all_countries'))
with open('charts/' + 'all_countries' + '.md', 'w') as f:
    f.write(CHART_MD_TEMPLATE.format(country='All Countries', chart_name='all_countries'))
# One chart + markdown page per country, with a tab per sector.
for country in country_list:
    df_plotting = results_chart[results_chart["Country_of_Incorporation"] == country]
    opts = dict(tools=[hover], toolbar='right', show_grid=True, width=1200, height=800, alpha=0.7,
                size_index='Amount_Outstanding', color_index='Rating', cmap=explicit_mapping, show_legend=False, scaling_factor=0.05, framewise=True)
    scatter = hv.Scatter(data=df_plotting, kdims=kdims, vdims=vdims, group=country).options(**opts)
    chart_list = [scatter]
    for sector in sorted(df_plotting.Sector.unique()):
        scatter_sector = hv.Scatter(data=df_plotting[df_plotting["Sector"] == sector],
                                    kdims=kdims, vdims=vdims, group=sector).options(**opts)
        chart_list.append(scatter_sector)
    chart_name = country.replace(' ','_').lower()
    hv.renderer('bokeh').save(hv.Layout(chart_list).options(tabs=True, toolbar='right', shared_axes=False).cols(1),'charts/renders/{}'.format(chart_name))
    with open('charts/' + chart_name + '.md', 'w') as f:
        f.write(CHART_MD_TEMPLATE.format(country=country, chart_name=chart_name))
# Publish to the GitHub Pages working copy.
shutil.copytree('charts', '../github_pages/mkangrga.github.io/mlprojects/charts')
For more interactive visualization, an active Python server is needed. For viewing on machines without python, this can be achieved by running a bokeh server (below) and pointing the browser to the server IP address and port specified in the Bokeh server instance.
# Standalone renders of the last chart list (HTML, then PNG).
hv.renderer('bokeh').save(hv.Layout(chart_list).options(tabs=True, toolbar='right', shared_axes=False).cols(1),'output')
hv.renderer('bokeh').save(hv.Layout(chart_list).options(tabs=True, toolbar='right', shared_axes=False).cols(1), 'output', fmt='png')
def process_index(group_categories, sorted_results_mapping):
    '''Flatten the category -> fields mapping into an ordered list of
    (category, field) tuples, suitable for pd.MultiIndex.from_tuples.'''
    return [(category, element)
            for category in group_categories
            for element in sorted_results_mapping[category]]
# Two-level (category, field) column header for the Excel output.
results_multiindex = pd.MultiIndex.from_tuples(process_index(group_categories, sorted_results_mapping))
results_present = pd.DataFrame(data=results[sorted_results_columns].values, index=results.index, columns=results_multiindex)
results_present.fillna('-').to_excel('results_output.xlsx')
This is meant to assist with analyzing the impact each feature has on the output by manually sensitizing each feature in Excel and immediately seeing the output change.
# Export the unscaled training matrix plus the fitted coefficients to Excel so
# each feature can be sensitized by hand with live recalculation of predicted YTW.
# Rows 1-3 of the sheet hold mean/stdev/coeffs; data rows start at row 4.
df_forensics = pd.DataFrame(columns=feature_names)
df_forensics.loc['mean',:] = df_train.mean()
df_forensics.loc['stdev',:] = df_train.std()
df_forensics.loc['coeffs',:] = clf.coef_
df_forensics = df_forensics.append(df_train)
df_forensics.insert(0, 'Intercept', 1)
df_forensics.insert(0, 'Issuer', df_analysis['Issuer'])
df_forensics.loc[['mean', 'stdev'],'Intercept'] = ''
df_forensics.loc['coeffs','Intercept'] = clf.intercept_
n_rows, n_cols = df_forensics.shape
writer = pd.ExcelWriter('fancy.xlsx', engine='xlsxwriter')
df_forensics.to_excel(writer, index=True, sheet_name='train', freeze_panes=(4,1))
workbook = writer.book
train_sheet = writer.sheets['train']
header_format = workbook.add_format({'text_wrap':'true','bold':'true','align':'center'})
header_format.set_border()
for i, col_name in enumerate(df_forensics.columns):
    train_sheet.write(0, i+1, col_name, header_format)
train_sheet.set_row(0, 50)#, cell_format=header_format)
[fmt.set_font_size(8) for fmt in workbook.formats]
scaled_sheet = workbook.add_worksheet('train_scaled')
# Create scaled worksheet, since the regressors expect scaled feature inputs
# (each cell = (x - mean) / stdev, referencing rows 1-2 of the 'train' sheet).
for row in range(4, n_rows+1):
    for col in range(3, n_cols+1):
        scaled_sheet.write_formula(row, col, '=(train!{}-train!{})/train!{}'.format(xl_rowcol_to_cell(row, col), xl_rowcol_to_cell(1, col), xl_rowcol_to_cell(2, col)))
# Add intercept and the regression formula
for row in range(4, n_rows+1):
    scaled_sheet.write_formula(row, 2, '1')
    train_sheet.write_formula(row, n_cols+2, '=SUMPRODUCT({}:{}, train_scaled!{}:{})'.format(xl_rowcol_to_cell(3,2), xl_rowcol_to_cell(3,n_cols),
                                                                                             xl_rowcol_to_cell(row,2), xl_rowcol_to_cell(row,n_cols)))
train_sheet.write(0, n_cols+2, 'Predicted YTW', header_format)
train_sheet.write(0, n_cols+1, 'Observed YTW', header_format)
for i, value in enumerate(df_analysis['YTW'].values):
    train_sheet.write(i+4, n_cols+1, value)
writer.save()
For more interactive visualization, an active Python server is needed. For viewing on machines without python, this can be achieved by running a bokeh server (below) and pointing the browser to the server IP address and port specified in the Bokeh server instance.
from bokeh.server.server import Server
# Serve the interactive layout from an embedded Bokeh server on port 51865,
# with per-sector US tabs grouped by leverage bucket.
scatter = hv.Scatter(data=results_chart, kdims=kdims, vdims=vdims, group='Global Bonds').options(**opts)
scatter_us = hv.Scatter(data=results_chart[results_chart["Country_of_Incorporation"] == 'United States'], kdims=kdims, vdims=vdims, group='US Bonds').options(**opts)
# scatter_sector = scatter.groupby(['Country','Sector']).options(**opts)
chart_list = [scatter, scatter_us]
for sector in sorted(results_chart.Sector.unique()):
    scatter_sector = hv.Scatter(data=results_chart[(results_chart["Sector"] == sector) & (results_chart["Country_of_Incorporation"] == 'United States')],
                                kdims=kdims, vdims=vdims, group=sector).groupby('Leverage').options(**opts)
    chart_list.append(scatter_sector)
scatter_leverage = hv.Layout(chart_list).options(tabs=True, toolbar='right', shared_axes=False).cols(1)
renderer = hv.renderer('bokeh')
renderer = renderer.instance(mode='server')
app = renderer.app(scatter_leverage)#, show=True, websocket_origin='localhost:51865')
server = Server({'/': app}, port=51865, allow_websocket_origin=['*'])
server.start()
server.show('/')
server.stop()
# hv.plotting.list_cmaps()
# Rating color scale (reversed red-yellow-green) for the US-only point charts.
palette = hv.Palette.colormaps['RdYlGn_r']
palette = hv.plotting.util.process_cmap(palette, ncolors=len(ratings_sorted[:-5]), categorical=True)
explicit_mapping = dict([(k, v) for k, v in zip(ratings_sorted, palette)])
# FIX: results_chart columns were sanitized (spaces -> underscores) earlier, so
# the original lookup by 'Country of Incorporation' raised a KeyError.
ds = hv.Dataset(data=results_chart[results_chart['Country_of_Incorporation'] == 'United States'],
                kdims=kdims, vdims=vdims)
opts = dict(tools=[hover], toolbar='right', show_grid=True, width=1000, height=800, alpha=0.6,
            size_index='Amount_Outstanding', color_index='Rating', cmap=explicit_mapping, show_legend=False, scaling_factor=0.05, framewise=True)
hm = ds.to(hv.Points, kdims=kdims, vdims=vdims, group='US Bonds').options(**opts)
# Leverage distribution by rating, one box-whisker panel per sector.
ds.to(hv.BoxWhisker, 'Rating', 'Net_Debt_EBITDA', groupby=['Sector']).options(width=800, height=800, xrotation=30, box_fill_color=hv.Palette('Category20'), framewise=True)
opts = dict(tools=[hover], toolbar='right', show_grid=True, width=1000, height=800, alpha=0.8,
            size_index='Amount_Outstanding',color_index='Rating', cmap=explicit_mapping, show_legend=False, scaling_factor=0.05, framewise=True)
scatter_sector = hv.Points(data=results_chart[(results_chart["Country_of_Incorporation"] == 'United States')],
                           kdims=kdims, vdims=vdims).groupby(['Sector', 'Leverage']).options(**opts)
scatter_sector
If certain bonds tend to cluster together, run a clustering DBSCAN algo to identify clusters and examine the underlying outliers
from sklearn.cluster import DBSCAN
from sklearn import metrics

# Cluster US bonds in (actual YTW, predicted YTW) space; points DBSCAN tags as
# noise (label -1) are the outlier candidates for a deeper relative-value dive.
X = results[results['Country'] == 'United States'][['YTW_true', 'YTW_predicted']].values

db = DBSCAN(eps=0.66, min_samples=10).fit(X)
labels = db.labels_

# Boolean mask flagging the core samples (points that anchor a cluster).
core_samples_mask = np.zeros(labels.shape, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels) - {-1})
print('Estimated number of clusters: %d' % n_clusters_)
# #############################################################################
# Plot the clustering result: one Spectral color per cluster, black for noise.
import matplotlib.pyplot as plt

unique_labels = set(labels)
cluster_colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))

for k, col in zip(unique_labels, cluster_colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    # Core members of the cluster are drawn large...
    core_xy = X[class_member_mask & core_samples_mask]
    plt.plot(core_xy[:, 0], core_xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    # ...border/noise members are drawn small.
    edge_xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(edge_xy[:, 0], edge_xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
# Persist the labelled US subset so the outliers (Label == -1) can be
# eyeballed in Excel alongside the full bond descriptives.
us_mask = results['Country'] == 'United States'
results_us = results[us_mask].copy()
results_us['Label'] = labels
results_us.to_excel('outliers.xlsx')
# Shared filter: drop unrated bonds and deeply distressed prices (<= 80),
# and reuse one rating-ordered coolwarm palette across both grids.
rated = df_analysis[(df_analysis['Rating'] != '-') & (df_analysis['Rating'] != '') & (df_analysis['Price'] > 80)]
rating_palette = sns.color_palette("coolwarm", n_colors=len(ratings_sorted))

# Leverage vs. yield, one panel per sector, colored by rating (scatter only).
sns.lmplot(x='Net Debt/EBITDA', y='YTW', col='Sector', hue='Rating',
           palette=rating_palette, hue_order=ratings_sorted,
           data=rated, sharex=False, sharey=False, fit_reg=False, col_wrap=5);

# Price vs. yield, one panel per seniority level, same coloring.
sns.lmplot(x='Price', y='YTW', col='Seniority', hue='Rating',
           palette=rating_palette, hue_order=ratings_sorted,
           data=rated, sharex=False, sharey=False, fit_reg=False, col_wrap=5);
Run the regression using all linear regressors in scikit-learn. Some regressors are unsuitable for this purpose or take a very long time to fit — screen those out first by timing each regression on a small subset of the data.
# run regression using all linear regressors with default settings
# Screening pass: time a 3-fold CV fit of every sklearn linear regressor on the
# first 500 rows so the slow/unsuitable ones can be excluded from the full run.
from sklearn.utils.testing import all_estimators
from sklearn import base
import time
import warnings

# Instantiate every regressor that derives from the linear-model base class.
linear_regressors = {}
for name, class_ in all_estimators(type_filter='regressor'):
    if issubclass(class_, linear_model.base.LinearModel):
        linear_regressors[name] = class_()

# TheilSenRegressor is far too slow for this screen; pop() with a default
# avoids a KeyError should a future sklearn version stop exporting it.
linear_regressors.pop('TheilSenRegressor', None)

time_dict = {}
for name, regressor in linear_regressors.items():
    start_time = time.time()
    try:
        with warnings.catch_warnings():
            # Convergence/deprecation chatter is irrelevant for a timing screen.
            warnings.simplefilter('ignore')
            model_selection.cross_val_score(regressor, train[:500], y=df_analysis['YTW'][:500],
                                            cv=model_selection.KFold(shuffle=True, random_state=1),
                                            scoring='r2')
        time_dict[name] = time.time() - start_time
    except Exception:
        # A regressor that cannot handle this data gets a 100s sentinel so the
        # time threshold below excludes it.  (Was a bare `except:`, which would
        # also have swallowed KeyboardInterrupt/SystemExit.)
        time_dict[name] = 100

# Bar chart of fit times; y-axis clipped at 0.5s so the fast ones stay readable.
fig, ax = plt.subplots(1)
ax.bar(time_dict.keys(), time_dict.values())
ax.set_ylim([0, 0.5])
ax.set_xticklabels(time_dict.keys(), rotation=45, horizontalalignment='right');
ax.set_ylabel('Time to complete');
Using default hyperparameters — this is not optimal; the hyperparameters should eventually be tuned with scikit-optimize's BayesSearchCV, but it is a decent start.
# Cross-validate, on the full dataset, every regressor that completed the
# 500-row screen in under 0.2 seconds.
regressor_scores = {
    name: model_selection.cross_val_score(estimator, train, y=df_analysis['YTW'],
                                          cv=model_selection.KFold(shuffle=True, random_state=1),
                                          scoring='r2')
    for name, estimator in linear_regressors.items()
    if time_dict[name] < 0.2
}

# Bar chart of the mean R^2 per regressor (one CV score array per column).
fig, ax = plt.subplots(1)
df_regressor_scores = pd.DataFrame(regressor_scores)
mean_scores = df_regressor_scores.mean()
ax.bar(mean_scores.index, mean_scores.values)
ax.set_ylim([-0.5, 1])
ax.set_xticklabels(mean_scores.index, rotation=45, horizontalalignment='right');
df_analysis.groupby(['Parent Name', 'Seniority'])[['Net Debt (Parent)', 'Amount Outstanding']].sum().sort_index().unstack().to_excel('a.xlsx')